#1 Intro
Organ: Diaphragm Method: FACS
#2 Preprocessing
##2.1 Load Packages
library(tidyverse) # bundle of package for data scienceRegistered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Registered S3 methods overwritten by 'ggplot2':
method from
[.quosures rlang
c.quosures rlang
print.quosures rlang
[37m── [1mAttaching packages[22m ──────────────────────────────── tidyverse 1.2.1 ──[39m
[37m[32m✔[37m [34mggplot2[37m 3.1.1 [32m✔[37m [34mpurrr [37m 0.3.2
[32m✔[37m [34mtibble [37m 2.1.2 [32m✔[37m [34mdplyr [37m 0.8.1
[32m✔[37m [34mtidyr [37m 0.8.3 [32m✔[37m [34mstringr[37m 1.4.0
[32m✔[37m [34mreadr [37m 1.3.1 [32m✔[37m [34mforcats[37m 0.4.0[39m
[37m── [1mConflicts[22m ─────────────────────────────────── tidyverse_conflicts() ──
[31m✖[37m [34mdplyr[37m::[32mfilter()[37m masks [34mstats[37m::filter()
[31m✖[37m [34mdplyr[37m::[32mlag()[37m masks [34mstats[37m::lag()[39m
library(cowplot) # ggplot addon for publication plot
Attaching package: ‘cowplot’
The following object is masked from ‘package:ggplot2’:
ggsave
library(SingleCellExperiment) #S4 vector for SC dataLoading required package: SummarizedExperiment
Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:parallel’:
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
clusterExport, clusterMap, parApply, parCapply, parLapply,
parLapplyLB, parRapply, parSapply, parSapplyLB
The following objects are masked from ‘package:dplyr’:
combine, intersect, setdiff, union
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, basename, cbind,
colnames, dirname, do.call, duplicated, eval, evalq, Filter,
Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
mapply, match, mget, order, paste, pmax, pmax.int, pmin,
pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
setdiff, sort, table, tapply, union, unique, unsplit, which,
which.max, which.min
Loading required package: S4Vectors
Attaching package: ‘S4Vectors’
The following objects are masked from ‘package:dplyr’:
first, rename
The following object is masked from ‘package:tidyr’:
expand
The following object is masked from ‘package:base’:
expand.grid
Loading required package: IRanges
Attaching package: ‘IRanges’
The following objects are masked from ‘package:dplyr’:
collapse, desc, slice
The following object is masked from ‘package:purrr’:
reduce
Loading required package: GenomeInfoDb
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages
'citation("pkgname")'.
Loading required package: DelayedArray
Loading required package: matrixStats
Attaching package: ‘matrixStats’
The following objects are masked from ‘package:Biobase’:
anyMissing, rowMedians
The following object is masked from ‘package:dplyr’:
count
Loading required package: BiocParallel
Attaching package: ‘DelayedArray’
The following objects are masked from ‘package:matrixStats’:
colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
The following object is masked from ‘package:purrr’:
simplify
The following objects are masked from ‘package:base’:
aperm, apply, rowsum
library(scater) # Toolkit for SC data analysis
Attaching package: ‘scater’
The following object is masked from ‘package:S4Vectors’:
rename
The following objects are masked from ‘package:dplyr’:
arrange, filter, mutate, rename
The following object is masked from ‘package:stats’:
filter
library(scran)# function for low level SC analysis
library(edgeR) # famous package for DE analysisLoading required package: limma
Attaching package: ‘limma’
The following object is masked from ‘package:scater’:
plotMDS
The following object is masked from ‘package:BiocGenerics’:
plotMA
Attaching package: ‘edgeR’
The following object is masked from ‘package:SingleCellExperiment’:
cpm
library(matrixStats)# package for matrix statistics
library(igraph) # package for graph
Attaching package: ‘igraph’
The following object is masked from ‘package:scater’:
normalize
The following objects are masked from ‘package:DelayedArray’:
path, simplify
The following object is masked from ‘package:GenomicRanges’:
union
The following object is masked from ‘package:IRanges’:
union
The following object is masked from ‘package:S4Vectors’:
union
The following objects are masked from ‘package:BiocGenerics’:
normalize, path, union
The following objects are masked from ‘package:dplyr’:
as_data_frame, groups, union
The following objects are masked from ‘package:purrr’:
compose, simplify
The following object is masked from ‘package:tidyr’:
crossing
The following object is masked from ‘package:tibble’:
as_data_frame
The following objects are masked from ‘package:stats’:
decompose, spectrum
The following object is masked from ‘package:base’:
union
library(biomaRt) #allows for annotation of genes
library(SC3) #package for consensus clusteringRegistered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
##2.2 Creating SingleCellExperiment Object
read the counts file and add annotations
#setting seed
set.seed(1897)
library(readr)
Diaphragm_counts = read.delim("Diaphragm-counts.csv", sep=",", header=TRUE)
dim(Diaphragm_counts)[1] 23433 952
rownames(Diaphragm_counts) <- Diaphragm_counts[,1]
Diaphragm_counts <- Diaphragm_counts[,-1]
cellIDs <- colnames(Diaphragm_counts)
cell_info <- strsplit(cellIDs, "\\.")
Well <- lapply(cell_info, function(x){x[1]})
Well <- unlist(Well)
Plate <- unlist(lapply(cell_info, function(x){x[2]}))
Mouse <- unlist(lapply(cell_info, function(x){x[3]}))
ann <- read.table("annotations_FACS.csv", sep=",", header=TRUE)
ann <- ann[match(cellIDs, ann[,1]),]
celltype <- ann[,3]
cell_anns <- data.frame(mouse = Mouse, well=Well, type=celltype)
rownames(cell_anns) <- colnames(Diaphragm_counts)show annotated cell types encountered
unique(celltype)[1] skeletal muscle satellite stem cell
[2] mesenchymal stem cell
[3] endothelial cell
[4] B cell
[5] macrophage
[6] <NA>
69 Levels: astrocyte of the cerebral cortex ... unknown
length(unique(celltype))[1] 6
check for spike-in information named ERCC for FACS
rownames(Diaphragm_counts)[grep("^ERCC-", rownames(Diaphragm_counts))] [1] "ERCC-00002" "ERCC-00003" "ERCC-00004" "ERCC-00009" "ERCC-00012"
[6] "ERCC-00013" "ERCC-00014" "ERCC-00016" "ERCC-00017" "ERCC-00019"
[11] "ERCC-00022" "ERCC-00024" "ERCC-00025" "ERCC-00028" "ERCC-00031"
[16] "ERCC-00033" "ERCC-00034" "ERCC-00035" "ERCC-00039" "ERCC-00040"
[21] "ERCC-00041" "ERCC-00042" "ERCC-00043" "ERCC-00044" "ERCC-00046"
[26] "ERCC-00048" "ERCC-00051" "ERCC-00053" "ERCC-00054" "ERCC-00057"
[31] "ERCC-00058" "ERCC-00059" "ERCC-00060" "ERCC-00061" "ERCC-00062"
[36] "ERCC-00067" "ERCC-00069" "ERCC-00071" "ERCC-00073" "ERCC-00074"
[41] "ERCC-00075" "ERCC-00076" "ERCC-00077" "ERCC-00078" "ERCC-00079"
[46] "ERCC-00081" "ERCC-00083" "ERCC-00084" "ERCC-00085" "ERCC-00086"
[51] "ERCC-00092" "ERCC-00095" "ERCC-00096" "ERCC-00097" "ERCC-00098"
[56] "ERCC-00099" "ERCC-00104" "ERCC-00108" "ERCC-00109" "ERCC-00111"
[61] "ERCC-00112" "ERCC-00113" "ERCC-00116" "ERCC-00117" "ERCC-00120"
[66] "ERCC-00123" "ERCC-00126" "ERCC-00130" "ERCC-00131" "ERCC-00134"
[71] "ERCC-00136" "ERCC-00137" "ERCC-00138" "ERCC-00142" "ERCC-00143"
[76] "ERCC-00144" "ERCC-00145" "ERCC-00147" "ERCC-00148" "ERCC-00150"
[81] "ERCC-00154" "ERCC-00156" "ERCC-00157" "ERCC-00158" "ERCC-00160"
[86] "ERCC-00162" "ERCC-00163" "ERCC-00164" "ERCC-00165" "ERCC-00168"
[91] "ERCC-00170" "ERCC-00171"
check for presence of ribosomal genes
rownames(Diaphragm_counts)[grep("^Rp[sl][[:digit:]]", rownames(Diaphragm_counts))] [1] "Rpl10" "Rpl10a" "Rpl10l" "Rpl11" "Rpl12"
[6] "Rpl13" "Rpl13a" "Rpl14" "Rpl15" "Rpl17"
[11] "Rpl18" "Rpl18a" "Rpl19" "Rpl21" "Rpl22"
[16] "Rpl22l1" "Rpl23" "Rpl23a" "Rpl24" "Rpl26"
[21] "Rpl27" "Rpl27a" "Rpl28" "Rpl29" "Rpl3"
[26] "Rpl30" "Rpl31" "Rpl31-ps12" "Rpl32" "Rpl34"
[31] "Rpl34-ps1" "Rpl35" "Rpl35a" "Rpl36" "Rpl36a"
[36] "Rpl36al" "Rpl37" "Rpl37a" "Rpl38" "Rpl39"
[41] "Rpl39l" "Rpl3l" "Rpl4" "Rpl41" "Rpl5"
[46] "Rpl6" "Rpl7" "Rpl7a" "Rpl7l1" "Rpl8"
[51] "Rpl9" "Rps10" "Rps11" "Rps12" "Rps13"
[56] "Rps14" "Rps15" "Rps15a" "Rps15a-ps4" "Rps15a-ps6"
[61] "Rps16" "Rps17" "Rps18" "Rps19" "Rps19-ps3"
[66] "Rps19bp1" "Rps2" "Rps20" "Rps21" "Rps23"
[71] "Rps24" "Rps25" "Rps26" "Rps27" "Rps27a"
[76] "Rps27l" "Rps28" "Rps29" "Rps3" "Rps3a"
[81] "Rps4x" "Rps4y2" "Rps5" "Rps6" "Rps6ka1"
[86] "Rps6ka2" "Rps6ka3" "Rps6ka4" "Rps6ka5" "Rps6ka6"
[91] "Rps6kb1" "Rps6kb2" "Rps6kc1" "Rps6kl1" "Rps7"
[96] "Rps8" "Rps9"
ribo.genes <- grep(pattern = "^Rp[sl][[:digit:]]", x = rownames(Diaphragm_counts), value = TRUE)
length(ribo.genes)[1] 97
create S4 object with extrated data
diaph_sce <- SingleCellExperiment(assays = list(counts = as.matrix(Diaphragm_counts)), colData=cell_anns)
head(diaph_sce)class: SingleCellExperiment
dim: 6 951
metadata(0):
assays(1): counts
rownames(6): 0610005C13Rik 0610007C21Rik ... 0610007P08Rik
0610007P14Rik
rowData names(0):
colnames(951): A8.D042105.3_11_M.1.1 K10.D042105.3_11_M.1.1 ...
N12.B002762.3_39_F.1.1 O21.B002762.3_39_F.1.1
colData names(3): mouse well type
reducedDimNames(0):
spikeNames(0):
move ERCC spike data to its own category, to study its behaviour separately
isSpike(diaph_sce, "ERCC") <- grepl("^ERCC-", rownames(diaph_sce))table(isSpike(diaph_sce, "ERCC"))
FALSE TRUE
23341 92
spikeNames(diaph_sce)[1] "ERCC"
move ribosomal genes to its own spike category: Ribosomal genes do not contribute to heterogeneity of cells. They could be present in any cell, so I separated them in order to leave them out later
isSpike(diaph_sce, "ribo") <- grepl("^^Rp[sl][[:digit:]]", rownames(diaph_sce))table(isSpike(diaph_sce, "ribo"))
FALSE TRUE
23336 97
spikeNames(diaph_sce)[1] "ERCC" "ribo"
head(diaph_sce)class: SingleCellExperiment
dim: 6 951
metadata(0):
assays(1): counts
rownames(6): 0610005C13Rik 0610007C21Rik ... 0610007P08Rik
0610007P14Rik
rowData names(0):
colnames(951): A8.D042105.3_11_M.1.1 K10.D042105.3_11_M.1.1 ...
N12.B002762.3_39_F.1.1 O21.B002762.3_39_F.1.1
colData names(3): mouse well type
reducedDimNames(0):
spikeNames(2): ERCC ribo
dim(diaph_sce)[1] 23433 951
is.ercc <- isSpike(diaph_sce, "ERCC")
is.ribo <- isSpike(diaph_sce, "ribo")table(isSpike(diaph_sce))
FALSE TRUE
23244 189
create a copy sce
my_diaph_sce <- diaph_sce#3 Feature Selection
##3.1 Basic gene filtering
###3.1.1 Gene expression profile
Average expression of a gene and the variance in read counts:
#Calculate gene mean across cell
gene_mean <- rowMeans(counts(diaph_sce))
#Calculate gene variance across cell
gene_var <- rowVars(counts(diaph_sce))
#ggplot plot
gene_stat_df <- tibble(gene_mean,gene_var)
ggplot(data=gene_stat_df ,aes(x=log(gene_mean), y=log(gene_var))) + geom_point(size=0.5) + theme_classic()###3.1.2 Filtering out low abundance genes
Show min and max of gene_mean and initial number of cells:
print(paste0( "min gene_mean = ",min(gene_mean)))[1] "min gene_mean = 0"
print(paste0( "max gene_mean = ",max(gene_mean)))[1] "max gene_mean = 54951.1261829653"
numcells <- nexprs(diaph_sce, byrow=TRUE)
print(paste0("initial number of cells: ",length(numcells)))[1] "initial number of cells: 23433"
Choose log-transformed values in plots to better view really low and high values at once.
Plot initial counts and highlight spike groups:
smoothScatter(log10(gene_mean), numcells,xlab=expression(Log[10]~"average count"),ylab="Number of expressing cells")
points(log10(gene_mean[is.ercc]), numcells[is.ercc], col="red", pch= 16,cex=0.5)points(log10(gene_mean[is.ribo]), numcells[is.ribo], col="green", pch= 16,cex=0.5)ERCC spike data are well embedded in the counts, whereas ribosomal genes behave like outliers.
Remove low abundance genes, show how many to keep:
abundant_genes <- gene_mean > 1
print(paste0("abundant genes: ",length(abundant_genes)))[1] "abundant genes: 23433"
print(paste0("genes to keep: ",sum(abundant_genes)))[1] "genes to keep: 11498"
plot low abundance gene filtering
hist(log2(gene_mean), breaks=100, main="", col="grey80",
xlab=expression(Log[2]~"average count"))
abline(v=log2(0.1), col="red", lwd=2, lty=2)remove low abundance genes in SingleCellExperiment Object
diaph_sce<- diaph_sce[abundant_genes,]
dim(diaph_sce)[1] 11498 951
head(diaph_sce)class: SingleCellExperiment
dim: 6 951
metadata(0):
assays(1): counts
rownames(6): 0610007C21Rik 0610007L01Rik ... 0610007P14Rik
0610007P22Rik
rowData names(0):
colnames(951): A8.D042105.3_11_M.1.1 K10.D042105.3_11_M.1.1 ...
N12.B002762.3_39_F.1.1 O21.B002762.3_39_F.1.1
colData names(3): mouse well type
reducedDimNames(0):
spikeNames(2): ERCC ribo
###3.1.3 Filtering genes that are expressed in very few cells
Calculate the number of non zero expression for each gene
numcells <- nexprs(diaph_sce, byrow=TRUE)
length(numcells)[1] 11498
Filter genes detected in less than 5 cells
numcells2 <- numcells >= 5
diaph_sce <- diaph_sce[numcells2,]
dim(diaph_sce)[1] 11456 951
###3.1.4 Plot the distributions of library sizes, numbers of genes expressed and ribosomal genes expressed, now we have filtered the data.
This function calculates useful quality control metrics to help with pre-processing of data and identification of potentially problematic features and cells.
diaph_sce <- calculateQCMetrics(diaph_sce)colnames(colData(diaph_sce)) [1] "mouse"
[2] "well"
[3] "type"
[4] "is_cell_control"
[5] "total_features_by_counts"
[6] "log10_total_features_by_counts"
[7] "total_counts"
[8] "log10_total_counts"
[9] "pct_counts_in_top_50_features"
[10] "pct_counts_in_top_100_features"
[11] "pct_counts_in_top_200_features"
[12] "pct_counts_in_top_500_features"
[13] "total_features_by_counts_endogenous"
[14] "log10_total_features_by_counts_endogenous"
[15] "total_counts_endogenous"
[16] "log10_total_counts_endogenous"
[17] "pct_counts_endogenous"
[18] "pct_counts_in_top_50_features_endogenous"
[19] "pct_counts_in_top_100_features_endogenous"
[20] "pct_counts_in_top_200_features_endogenous"
[21] "pct_counts_in_top_500_features_endogenous"
[22] "total_features_by_counts_feature_control"
[23] "log10_total_features_by_counts_feature_control"
[24] "total_counts_feature_control"
[25] "log10_total_counts_feature_control"
[26] "pct_counts_feature_control"
[27] "pct_counts_in_top_50_features_feature_control"
[28] "pct_counts_in_top_100_features_feature_control"
[29] "pct_counts_in_top_200_features_feature_control"
[30] "pct_counts_in_top_500_features_feature_control"
[31] "total_features_by_counts_ERCC"
[32] "log10_total_features_by_counts_ERCC"
[33] "total_counts_ERCC"
[34] "log10_total_counts_ERCC"
[35] "pct_counts_ERCC"
[36] "pct_counts_in_top_50_features_ERCC"
[37] "pct_counts_in_top_100_features_ERCC"
[38] "pct_counts_in_top_200_features_ERCC"
[39] "pct_counts_in_top_500_features_ERCC"
[40] "total_features_by_counts_ribo"
[41] "log10_total_features_by_counts_ribo"
[42] "total_counts_ribo"
[43] "log10_total_counts_ribo"
[44] "pct_counts_ribo"
[45] "pct_counts_in_top_50_features_ribo"
[46] "pct_counts_in_top_100_features_ribo"
[47] "pct_counts_in_top_200_features_ribo"
[48] "pct_counts_in_top_500_features_ribo"
Low quality cells = low library sizes (due to tecnhical effects during measurement) Low quality of features = cells with few genes expressed
par(mfrow=c(2,1), mar=c(5.1, 4.1, 0.1, 0.1))
hist(diaph_sce$total_counts/1e6, xlab="Library sizes (millions)", main="",
breaks=20, col="grey80", ylab="Number of cells")
hist(diaph_sce$total_features_by_counts, xlab="Number of expressed genes", main="",
breaks=20, col="grey80", ylab="Number of cells")
Same quality check for ribosomal and ERCC spikes in data: these percetages should increase for low quality cells
par(mfrow=c(1,2),mar=c(5.1, 4.1, 0.1, 0.1))
hist(diaph_sce$total_counts_ribo, xlab="Ribosomal proportion (%)",ylab="Number of cells", breaks=20, main="",col= "grey80")
hist(diaph_sce$total_counts_ERCC, xlab="ERCC proportion (%)",ylab="Number of cells", breaks=20, main="",col= "grey80")###3.1.5.a Filter out low quality cells manually
Pick a threshold: Remove cells more than 2 MAD(median absolute deviation) below the median for library size and total features. The log option is chosen to better judge the smaller values and outliers should be looked for at both tails (“both”), by using the previous plots. Also remove cells more than 3 MAD below the median for ribosomal genes and ERCC spikes, with outliers looked at the heavy tail(“lower”). Finally, view remaining cells:
libsize.drop <- isOutlier(diaph_sce$total_counts, nmads=2, type="both", log=TRUE)
feature.drop <- isOutlier(diaph_sce$total_features_by_counts, nmads=2, type="both",log=TRUE)
ribo.drop <- isOutlier(diaph_sce$pct_counts_ribo, nmads=3, type="lower", log=T)
spike.drop <- isOutlier(diaph_sce$pct_counts_ERCC,nmads=3, type="lower",log=T)
diaph_sce <- diaph_sce[,!(libsize.drop | feature.drop |ribo.drop | spike.drop )]
data.frame(ByLibSize=sum(libsize.drop), ByFeature=sum(feature.drop), ByRibo=sum(ribo.drop), ByERCC=sum(spike.drop), Remaining=ncol(diaph_sce))ByLibSize <int> | ByFeature <int> | ByRibo <int> | ByERCC <int> | Remaining <int> |
|---|---|---|---|---|
| 70 | 75 | 31 | 1 | 833 |
NA###3.1.5.b Filter out low quality cells manually and automatically and compare
Here is another method for selecting outliers.
reads <- calculateQCMetrics(my_diaph_sce, feature_controls = list(ERCC = isSpike(my_diaph_sce, "ERCC"), RIBO = isSpike(my_diaph_sce, "ribo")))spike-in set 'ERCC' overwritten by feature_controls set of the same name
Manually with the help of plots I named outliers by some value that seems to lead to a haevy tail:
plotColData(reads, x = "log10_total_features_by_counts", y = "log10_total_counts", colour_by = "type")hist(reads$log10_total_counts, breaks = 100)
abline(v = 4.8, col = "red")abline(v = 6.1, col = "red")filter_by_total_counts <- (reads$log10_total_counts > 4.8 & reads$log10_total_counts < 6.1)
table(filter_by_total_counts)filter_by_total_counts
FALSE TRUE
85 866
hist(reads$log10_total_features_by_counts, breaks = 100)
abline(v = 3.1, col = "red")abline(v = 3.55, col = "red")filter_by_expr_features <- (reads$log10_total_features_by_counts > 3.1 &reads$log10_total_features_by_counts < 3.55)
table(filter_by_expr_features)filter_by_expr_features
FALSE TRUE
111 840
plotColData(reads, x = "log10_total_features_by_counts", y = "pct_counts_ERCC", colour_by = "type")hist(reads$pct_counts_ERCC, breaks = 100)
abline(v = 3, col = "red")abline(v = 1, col = "red")filter_by_ERCC <- ( reads$type != "NA" & reads$pct_counts_ERCC < 3 & reads$pct_counts_ERCC >1)
table(filter_by_ERCC)filter_by_ERCC
FALSE TRUE
596 348
plotColData(reads, x = "total_features_by_counts", y = "pct_counts_ribo", colour_by = "type")hist(reads$pct_counts_RIBO, breaks = 100)
abline(v = 2.4, col = "red")abline(v = 1.4, col = "red")filter_by_RIBO <- (reads$type != "NA" & reads$pct_counts_RIBO < 2.4 & reads$pct_counts_RIBO >1.4)
table(filter_by_RIBO)filter_by_RIBO
FALSE TRUE
761 179
reads$use <- (filter_by_expr_features & filter_by_total_counts & filter_by_ERCC & filter_by_RIBO )
table(reads$use)
FALSE TRUE
825 126
Automatically detect outliers:
Use PCA on QC metrics to detect outliers
library(mvoutlier)Loading required package: sgeostat
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
sROC 0.1-2 loaded
reads <- runPCA( reads, use_coldata = TRUE, detect_outliers = TRUE)
reducedDimNames(reads)[1] "PCA_coldata"
table(reads$outlier)
FALSE TRUE
878 73
plotReducedDim( reads, use_dimred = "PCA_coldata", size_by = "total_features_by_counts", shape_by = "use", colour_by = "outlier")Compare manual and automatic selection:
library(limma)
auto <- colnames(reads)[reads$oulier]
man <- colnames(reads)[!reads$use]
venn.diag <- vennCounts(cbind(colnames(reads) %in% auto, colnames(reads) %in% man))
vennDiagram(venn.diag, names = c("Automatic", "Manual"), circle.col = c("blue", "green"))Manual and automatic methods are very close. The manual method gives a similar with the one found in 3.1.5.a, so I keep that one throughout the study.
###3.1.6 Data normalisation
Normalize data in order to eliminate cell-specific biases. With the hypothesis that genes are similarly distributed in all cells, any difference in count size of similarly distributed genes between 2 cells, indicate a bias that needs to be removed by scaling. Size factors are calculated(they represent the amount of counts to be scaled in each library).
#data normalisation using a size factor approach.
diaph_sce <- computeSumFactors(diaph_sce, sizes=c(20, 40, 60, 80))
summary(sizeFactors(diaph_sce)) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.03712 0.31325 0.82001 1.00000 1.49703 3.87496
Total counts over size factor plot: the scatter is caused by the presence of DE(differentialy expressed) genes for different types of cells.
plot(sizeFactors(diaph_sce), diaph_sce$total_counts/1e6, log="xy",ylab="Library size (millions)", xlab="Size factor")Compute size factors for spike groups:
diaph_sce <- computeSpikeFactors(diaph_sce, type="ERCC", general.use=F)
diaph_sce <- computeSpikeFactors(diaph_sce, type="ribo", general.use=F)Plot of size factors of spike groups shows that they behave like noise.
plot(sizeFactors(diaph_sce), diaph_sce$total_counts_feature_control/1e6, log="xy",ylab="Library size (millions)", xlab="Size factor")plot(sizeFactors(diaph_sce), diaph_sce$total_counts_ERCC/1e6, log="xy",ylab="Library size (millions)", xlab="Size factor")plot(sizeFactors(diaph_sce), diaph_sce$total_counts_ribo/1e6, log="xy",ylab="Library size (millions)", xlab="Size factor")Percentage of total counts assigned to the top 30 most highly-abundant features in the dataset. For each gene(feature), each bar represents the percentage of that gene for a single cell, while the circle represents the average across all cells. The different colours indicate the differently annotated cells of the dataset. There are present spike features that suggest they should be removed.
plotHighestExprs(diaph_sce,n=30, colour_cells_by = "type")diaph_sce <- scater::normalize(diaph_sce) #log normalised stored in logcountsAn additional way to normalize this data is to convert it to counts per million (CPM) by dividing each column by its total, then multiplying by 1,000,000.
cpm(diaph_sce) <- log2(calculateCPM(diaph_sce, use_size_factors = F) + 1)###3.1.7 Detecting Highly Variable Genes
Model the technical coefficient of variation as a function of the mean.
out<-technicalCV2(diaph_sce, spike.type=NA, assay.type= "counts")
out$HVG <- (out$FDR<0.05)
as_tibble(out)mean <dbl> | var <dbl> | cv2 <dbl> | trend <dbl> | p.value <dbl> | FDR <dbl> | HVG <lgl> |
|---|---|---|---|---|---|---|
| 1.359858e+02 | 9.539677e+04 | 5.1587715 | 6.876048 | 1.000000e+00 | 1.000000e+00 | FALSE |
| 3.707508e+01 | 2.024596e+04 | 14.7290282 | 14.362917 | 8.543465e-01 | 1.000000e+00 | FALSE |
| 9.143644e+00 | 1.082384e+03 | 12.9462146 | 45.805951 | 1.000000e+00 | 1.000000e+00 | FALSE |
| 4.963121e+00 | 1.743271e+03 | 70.7709749 | 80.961109 | 9.977808e-01 | 1.000000e+00 | FALSE |
| 1.179441e+01 | 3.246263e+03 | 23.3362552 | 36.425823 | 1.000000e+00 | 1.000000e+00 | FALSE |
| 1.180912e+01 | 5.910229e+03 | 42.3808310 | 36.385534 | 4.488537e-03 | 2.054362e-02 | TRUE |
| 4.842387e+00 | 1.500221e+03 | 63.9788339 | 82.878220 | 9.999999e-01 | 1.000000e+00 | FALSE |
| 5.274491e+01 | 1.483752e+04 | 5.3333512 | 11.304941 | 1.000000e+00 | 1.000000e+00 | FALSE |
| 1.581913e+00 | 5.588912e+02 | 223.3377202 | 245.310077 | 9.673790e-01 | 1.000000e+00 | FALSE |
| 4.321465e+00 | 9.830106e+02 | 52.6376186 | 92.378023 | 1.000000e+00 | 1.000000e+00 | FALSE |
Highly variable genes are respsonsible for cell heterogeneity. Use the variance to decompose into biological and technical components. Plot highly variable genes:
ggplot(data = out) + geom_point(aes(x=log2(mean), y=log2(cv2), color=HVG), size=0.5) + geom_point(aes(x=log2(mean), y=log2(trend)), color="red", size=0.1)Retrieve HVG: HVG are the ones with the largest biological components.
out = out[out[, "HVG"] == TRUE,]
out <- out[order(out$cv2, decreasing = T),]
HVG_CV2 <- rownames(diaph_sce)[out$HVG]Another way to separate to the biological componets is by taking into account only the endogenous genes, by leaving out of the calculations the spike information. The total variance of the endogenous genes is influenced primarily by the technical component.
var.fit <- trendVar(diaph_sce,method="loess", use.spikes=F)
var.out <- decomposeVar(diaph_sce, var.fit)Confront the variance of the endogeneous genes with the spike-in genes’ variance: The fit curve passes through most of the ERCC spikes that are technicaly related to most genes, whereas ignores the majority of ribosomal genes that present biological variability.
plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression",
ylab="Variance of log-expression")
o <- order(var.out$mean)
lines(var.out$mean[o], var.out$tech[o], col="dodgerblue", lwd=2)cur.spike1 <- isSpike(diaph_sce,"ERCC")
cur.spike2 <- isSpike(diaph_sce,"ribo")
points(var.out$mean[cur.spike1], var.out$total[cur.spike1], col="red", pch=16)
points(var.out$mean[cur.spike2], var.out$total[cur.spike2], col="green", pch=16)HVGs are genes with biological component >0.5 at a false discovery rate (FDR) of 5%.
hvg.out <- var.out[which(var.out$FDR <= 0.05 & var.out$bio >= 0.5),]
hvg.out <- hvg.out[order(hvg.out$bio, decreasing=TRUE),]
nrow(hvg.out)[1] 1086
Top 20 HVGs:
plotExpression(diaph_sce, rownames(hvg.out)[1:20])Remove ribosomal genes:
isSpike(diaph_sce, "ribo") <- NULL
spikeNames(diaph_sce)[1] "ERCC"
Remove ERCC spikes:
isSpike(diaph_sce, "ERCC") <- NULL
spikeNames(diaph_sce)character(0)
#4 Dimensionality Reduction
Reduce some of the noise of the data by projecting them on lower-dimensional space.
##4.1 PCA
###4.1.1 Perform PCA
The data are log-transformed and normalized by library size (cpm), in order to reduce the 1st component variance and make the distribution more normal.
PCA <- prcomp(t(logcounts(diaph_sce)))
PCA_coord <- PCA$x
colnames(PCA_coord) <- paste0("PC", seq(ncol(PCA_coord)))Visualize data in 2 dimensions: Plot PC1 and PC2
PCA_plot_df <- rownames_to_column(as.data.frame(PCA_coord), "Cell")
PCA_ggplot <- ggplot(PCA_plot_df,aes(x=PC1, y =PC2))+ geom_point(aes(shape=diaph_sce$type, color=diaph_sce$type)) +theme_classic()
PCA_ggplotPlot explained Variances
screeplot(PCA, type="lines", npcs = 20)Significant components are around 10, before the curve’s plateau.
##4.2 tSNE
After the visualisation of the cell heterogeneity, we want to characterise the different subgroups of cells.
###4.2.1 Perform tSNE with SCE Object
markers that are specific to skeletal cells
skeletal.cell.markers <- c('Vcam1','Itga7','Calcr','Pax7','Myod1')markers for mesenchymal cells
mesenchymal.cell.markers <- c('Pdgfra')markers for endothelial cells
endothelial.cell.markers <- c("Pecam1")markers for lemphocyte cells
lemphocyte.cell.markers <- c("Cd69","Cd19","Cd79a")markers for macrophage cells
macro.cell.markers <- c("Ptprc", "Itgam","Fcer1g")Running TSNE
diaph_sce <- runTSNE(diaph_sce, perplexity= 10)
plotTSNE(diaph_sce, colour_by="total_counts",
size_by="total_features_by_counts", shape_by = "type")Running TSNE multiple times will result in different plots. Perplexity is the factor that changes the clusters’ shapes.
diaph_sceOther <- diaph_sce
diaph_sceOther <- runTSNE(diaph_sceOther, perplexity= 30)
plotTSNE(diaph_sceOther, colour_by="total_counts",
size_by="total_features_by_counts", shape_by = "type")diaph_sceOther <- runTSNE(diaph_sceOther, perplexity= 80)
plotTSNE(diaph_sceOther, colour_by="total_counts",
size_by="total_features_by_counts", shape_by = "type")diaph_sceOther <- runTSNE(diaph_sceOther, perplexity= 10)
plotTSNE(diaph_sceOther, colour_by="total_counts",
size_by="total_features_by_counts", shape_by = "type")After running for different perplexity values, the value of 10 keeps the largest distances between clusters and separates best the different types. So I will be using the diaph_sce that is not influenced by the above reruns of TSNE.
Lets see what cell types we have present using some of our key markers
plotTSNE(diaph_sce, colour_by="Vcam1", shape_by = "type")plotTSNE(diaph_sce, colour_by="Pdgfra", shape_by = "type")
plotTSNE(diaph_sce, colour_by="Pecam1", shape_by = "type")plotTSNE(diaph_sce, colour_by="Cd79a", shape_by = "type")plotTSNE(diaph_sce, colour_by="Itgam", shape_by = "type")#5 Clustering
Unsuperrvised clustering: create clusters with no prior knowledge of cell types, by only considering similarities of their features. The number of clusters is generally unknown.
##5.1 Basic Clustering Methods
###5.1.1 Hierarchical Clustering
Calculate Distances, compare and create a hierarchy:
distance <- dist(t(logcounts(diaph_sce)))
ward_hclust <-hclust(distance,method = "ward.D2")
par(mfrow=c(1,1))
plot(ward_hclust, labels = F, hang = 0.005)Decide to cut the cluster tree: make 5 groups
cluster_hclust <- cutree(ward_hclust,k = 5)
colData(diaph_sce)$cluster_hclust <- factor(cluster_hclust)
plot_grid(plotTSNE(diaph_sce,colour_by = "cluster_hclust"))The decision to cut the tree in 5 groups was made by considering the awaited 5 types of cells which are clearly clustered separately.
###5.1.2 TSNE + Kmeans
Run TSNE
diaph_sce <- runTSNE(diaph_sce,perplexity= 10)Do kmeans algorithm on TSNE coordinates
deng_kmeans <- kmeans(x = diaph_sce@reducedDims$TSNE,centers = 5)
TSNE_kmeans <- factor(deng_kmeans$cluster)
colData(diaph_sce)$TSNE_kmeans <- TSNE_kmeansCompare with hierarchical clustering
plot_grid(plotTSNE(diaph_sce, colour_by = "TSNE_kmeans"),
plotTSNE(diaph_sce, colour_by = "cluster_hclust"))Both clustering methods reveal the same clusters.
#6 Put genes information into SCE Object
rowData(diaph_sce)$feature_symbol <- rownames(diaph_sce)
diaph_sce <- sc3(diaph_sce, ks = 5,n_cores = detectCores() - 2)Setting SC3 parameters...
Calculating distances between the cells...
starting worker pid=4288 on localhost:11439 at 20:40:47.240
starting worker pid=4302 on localhost:11439 at 20:40:48.255
Loading required package: SC3
Loading required package: SC3
Registered S3 methods overwritten by 'ggplot2':
method from
[.quosures rlang
c.quosures rlang
print.quosures rlang
Registered S3 methods overwritten by 'ggplot2':
method from
[.quosures rlang
c.quosures rlang
print.quosures rlang
loaded SC3 and set parent environment
loaded SC3 and set parent environment
Loading required package: foreach
Loading required package: foreach
Loading required package: rngtools
Loading required package: rngtools
Loading required package: pkgmaker
Loading required package: pkgmaker
Loading required package: registry
Loading required package: registry
Attaching package: ‘pkgmaker’
Attaching package: ‘pkgmaker’
The following object is masked from ‘package:base’:
isFALSE
The following object is masked from ‘package:base’:
isFALSE
Performing transformations and calculating eigenvectors...
starting worker pid=4352 on localhost:11439 at 20:42:47.558
starting worker pid=4368 on localhost:11439 at 20:42:50.277
Loading required package: SC3
Loading required package: SC3
Registered S3 methods overwritten by 'ggplot2':
method from
[.quosures rlang
c.quosures rlang
print.quosures rlang
Registered S3 methods overwritten by 'ggplot2':
method from
[.quosures rlang
c.quosures rlang
print.quosures rlang
loaded SC3 and set parent environment
loaded SC3 and set parent environment
Loading required package: foreach
Loading required package: foreach
Loading required package: rngtools
Loading required package: rngtools
Loading required package: pkgmaker
Loading required package: pkgmaker
Loading required package: registry
Loading required package: registry
Attaching package: ‘pkgmaker’
Attaching package: ‘pkgmaker’
The following object is masked from ‘package:base’:
isFALSE
The following object is masked from ‘package:base’:
isFALSE
Performing k-means clustering...
starting worker pid=4399 on localhost:11439 at 20:43:21.554
starting worker pid=4413 on localhost:11439 at 20:43:21.769
Loading required package: SC3
Loading required package: SC3
Registered S3 methods overwritten by 'ggplot2':
method from
[.quosures rlang
c.quosures rlang
print.quosures rlang
Registered S3 methods overwritten by 'ggplot2':
method from
[.quosures rlang
c.quosures rlang
print.quosures rlang
loaded SC3 and set parent environment
Loading required package: foreach
Loading required package: rngtools
Loading required package: pkgmaker
Loading required package: registry
Attaching package: ‘pkgmaker’
The following object is masked from ‘package:base’:
isFALSE
loaded SC3 and set parent environment
Loading required package: foreach
Loading required package: rngtools
Loading required package: pkgmaker
Loading required package: registry
Attaching package: ‘pkgmaker’
The following object is masked from ‘package:base’:
isFALSE
|
|
| | 0%
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 6%
|
|==== | 7%
|
|===== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 11%
|
|======== | 12%
|
|======== | 13%
|
|========= | 15%
|
|========== | 16%
|
|=========== | 17%
|
|=========== | 18%
|
|============ | 19%
|
|============= | 20%
|
|============= | 21%
|
|============== | 22%
|
|=============== | 24%
|
|================ | 25%
|
|================ | 26%
|
|================= | 27%
|
|================== | 28%
|
|================== | 29%
|
|=================== | 30%
|
|==================== | 31%
|
|===================== | 33%
|
|===================== | 34%
|
|====================== | 35%
|
|======================= | 36%
|
|======================= | 37%
|
|======================== | 38%
|
|========================= | 39%
|
|========================= | 40%
|
|========================== | 42%
|
|=========================== | 43%
|
|============================ | 44%
|
|============================ | 45%
|
|============================= | 46%
|
|============================== | 47%
|
|============================== | 48%
|
|=============================== | 49%
|
|================================ | 51%
|
|================================= | 52%
|
|================================= | 53%
|
|================================== | 54%
|
|=================================== | 55%
|
|=================================== | 56%
|
|==================================== | 57%
|
|===================================== | 58%
|
|====================================== | 60%
|
|====================================== | 61%
|
|======================================= | 62%
|
|======================================== | 63%
|
|======================================== | 64%
|
|========================================= | 65%
|
|========================================== | 66%
|
|========================================== | 67%
|
|=========================================== | 69%
|
|============================================ | 70%
|
|============================================= | 71%
|
|============================================= | 72%
|
|============================================== | 73%
|
|=============================================== | 74%
|
|=============================================== | 75%
|
|================================================ | 76%
|
|================================================= | 78%
|
|================================================== | 79%
|
|================================================== | 80%
|
|=================================================== | 81%
|
|==================================================== | 82%
|
|==================================================== | 83%
|
|===================================================== | 84%
|
|====================================================== | 85%
|
|======================================================= | 87%
|
|======================================================= | 88%
|
|======================================================== | 89%
|
|========================================================= | 90%
|
|========================================================= | 91%
|
|========================================================== | 92%
|
|=========================================================== | 93%
|
|=========================================================== | 94%
|
|============================================================ | 96%
|
|============================================================= | 97%
|
|============================================================== | 98%
|
|============================================================== | 99%
|
|===============================================================| 100%
Warning message:
Quick-TRANSfer stage steps exceeded maximum (= 41650)
Calculating consensus matrix...
starting worker pid=4469 on localhost:11439 at 20:48:12.315
Loading required package: SC3
Registered S3 methods overwritten by 'ggplot2':
method from
[.quosures rlang
c.quosures rlang
print.quosures rlang
loaded SC3 and set parent environment
Loading required package: foreach
Loading required package: rngtools
Loading required package: pkgmaker
Loading required package: registry
Attaching package: ‘pkgmaker’
The following object is masked from ‘package:base’:
isFALSE
plotTSNE(diaph_sce, colour_by="sc3_5_clusters")plotTSNE(diaph_sce, colour_by="Vcam1")plotTSNE(diaph_sce, colour_by="Pdgfra")plotTSNE(diaph_sce, colour_by="Pecam1")plotTSNE(diaph_sce, colour_by="Cd79a")plotTSNE(diaph_sce, colour_by="Itgam")concensus<-sc3_plot_consensus(diaph_sce, k = 5)Question: In the example above we manually set the number of clusters to 3, but is this the best value we can use? hint: try using the sc3_estimate_k() function
diaph_sce <- sc3_estimate_k(diaph_sce)Estimating k...
str(metadata(diaph_sce)$sc3)List of 8
$ kmeans_iter_max: num 1e+09
$ kmeans_nstart : num 1000
$ n_dim : int [1:15] 50 39 52 34 55 46 53 37 59 33 ...
$ rand_seed : num 1
$ n_cores : num 2
$ transformations:List of 6
..$ euclidean_pca : num [1:833, 1:59] 4.45e-02 -7.96e-03 -1.97e-05 4.48e-02 3.60e-02 ...
.. ..- attr(*, "dimnames")=List of 2
.. .. ..$ : NULL
.. .. ..$ : chr [1:59] "PC1" "PC2" "PC3" "PC4" ...
..$ pearson_pca : num [1:833, 1:59] 0.0389 0.0268 0.0261 -0.0196 0.0192 ...
.. ..- attr(*, "dimnames")=List of 2
.. .. ..$ : chr [1:833] "A8.D042105.3_11_M.1.1" "K10.D042105.3_11_M.1.1" "L13.D042105.3_11_M.1.1" "M15.D042105.3_11_M.1.1" ...
.. .. ..$ : chr [1:59] "PC1" "PC2" "PC3" "PC4" ...
..$ spearman_pca : num [1:833, 1:59] 0.037 0.0274 0.0273 -0.022 0.0165 ...
.. ..- attr(*, "dimnames")=List of 2
.. .. ..$ : chr [1:833] "A8.D042105.3_11_M.1.1" "K10.D042105.3_11_M.1.1" "L13.D042105.3_11_M.1.1" "M15.D042105.3_11_M.1.1" ...
.. .. ..$ : chr [1:59] "PC1" "PC2" "PC3" "PC4" ...
..$ euclidean_laplacian: num [1:833, 1:59] -0.0346 -0.0343 -0.0345 -0.0352 -0.0345 ...
..$ pearson_laplacian : num [1:833, 1:59] -0.0344 -0.0352 -0.035 -0.0328 -0.0344 ...
..$ spearman_laplacian : num [1:833, 1:59] -0.0344 -0.0351 -0.0349 -0.0331 -0.0344 ...
..- attr(*, "rng")=List of 6
.. ..$ : int [1:7] 10407 1925952071 -143499812 1396004141 -1405780470 474378243 -1474635416
.. ..$ : int [1:7] 10407 -1026591922 -1944015743 -1106275405 1814212950 -367663613 -90021287
.. ..$ : int [1:7] 10407 -1874118155 -583966373 1610586893 1326252861 1580748345 -507446229
.. ..$ : int [1:7] 10407 -1253593145 809057222 -1365155196 19341051 2109970545 -989228701
.. ..$ : int [1:7] 10407 490662591 1568940018 -1836223833 31852861 -1952524165 1246995627
.. ..$ : int [1:7] 10407 2097953577 -869847254 47889187 1178703489 -1406044897 -2118043632
$ consensus :List of 1
..$ 5:List of 3
.. ..$ consensus : num [1:833, 1:833] 1 0 0 0 0.233 ...
.. .. ..- attr(*, "dimnames")=List of 2
.. .. .. ..$ : chr [1:833] "1" "2" "3" "4" ...
.. .. .. ..$ : chr [1:833] "1" "2" "3" "4" ...
.. ..$ hc :List of 7
.. .. ..$ merge : int [1:832, 1:2] -1 -9 -10 -11 -12 -13 -14 -15 -16 -17 ...
.. .. ..$ height : num [1:832] 0 0 0 0 0 0 0 0 0 0 ...
.. .. ..$ order : int [1:833] 171 221 724 494 364 362 360 359 357 356 ...
.. .. ..$ labels : chr [1:833] "1" "2" "3" "4" ...
.. .. ..$ method : chr "complete"
.. .. ..$ call : language stats::hclust(d = diss)
.. .. ..$ dist.method: NULL
.. .. ..- attr(*, "class")= chr "hclust"
.. ..$ silhouette: 'silhouette' num [1:833, 1:3] 1 2 2 3 3 4 5 1 1 1 ...
.. .. ..- attr(*, "dimnames")=List of 2
.. .. .. ..$ : NULL
.. .. .. ..$ : chr [1:3] "cluster" "neighbor" "sil_width"
.. .. ..- attr(*, "Ordered")= logi FALSE
.. .. ..- attr(*, "call")= language silhouette.default(x = clusts, dist = diss)
$ k_estimation : num 12
sc3_plot_consensus(diaph_sce, k = 5, show_pdata = c("type", "log10_total_features", "sc3_5_clusters", "sc3_5_log2_outlier_score" ))Provided columns 'log10_total_features', 'sc3_5_log2_outlier_score' do not exist in the phenoData table!
sc3_plot_expression(diaph_sce, k = 5, show_pdata = c("type", "log10_total_features", "sc3_5_clusters", "sc3_5_log2_outlier_score" ))Provided columns 'log10_total_features', 'sc3_5_log2_outlier_score' do not exist in the phenoData table!
#7 Differential Expression
Find marker genes characterising the different groups by conducting differential expression analysis on the expression matrix.
##7.1 Wilcoxon Test
###7.1.1 Testing one group against another
Test one group against another
pValsgr <- apply(
counts(diaph_sce), 1, function(x) {
wilcox.test(
x[colData(diaph_sce)$sc3_5_clusters == 1], #1st cluster
x[colData(diaph_sce)$sc3_5_clusters == 2], #2nd cluster
alternative = "greater"
)$p.value
}
)pValsfdr <- p.adjust(pValsgr, method = "fdr")sort genes by fdr and filter non significant genes
DE_genesfdr <- names(which(sort(pValsfdr) < (0.05)))
head(DE_genesfdr)[1] "Chodl" "Des" "Cd82" "Sdc4" "Ncam1" "Myf5"
plotTSNE(diaph_sce, colour_by = DE_genesfdr[1])Plot Violin
plotExpression(diaph_sce, x = "sc3_5_clusters", colour_by = "sc3_5_clusters",features = DE_genesfdr[1:6])+ theme(axis.text.x = element_text(angle = 90, hjust = 1))other filter methods for p.adjust tests give DE_genes:
print(paste0("fdr = ", length(DE_genesfdr)))[1] "fdr = 495"
pValsBY <- p.adjust(pValsgr, method = "BY")
DE_genesBY <- names(which(sort(pValsBY) < (0.05)))
print(paste0("BY = ", length(DE_genesBY)))[1] "BY = 388"
pValsbon <- p.adjust(pValsgr, method = "bonferroni")
DE_genesbon <- names(which(sort(pValsbon) < (0.05)))
print(paste0("bonferroni = ", length(DE_genesbon)))[1] "bonferroni = 300"
pValshom <- p.adjust(pValsgr, method = "hommel")
DE_geneshom <- names(which(sort(pValshom) < (0.05)))
print(paste0("hommel = ", length(DE_geneshom)))[1] "hommel = 300"
pValshoch <- p.adjust(pValsgr, method = "hochberg")
DE_geneshoch <- names(which(sort(pValshoch) < (0.05)))
print(paste0("hochberg = ", length(DE_geneshoch)))[1] "hochberg = 300"
pValsno <- p.adjust(pValsgr, method = "none")
DE_genesno <- names(which(sort(pValsno) < (0.05)))
print(paste0("none = ", length(DE_genesno)))[1] "none = 904"
frd and BY method seem similar, while bonferroni,hommel,hochberg are more strict. none method provides no filter.
When we change our threshold for significance (current value is set to 0.05)
print(paste0("fdr with 0.05 = ", length(DE_genesfdr)))[1] "fdr with 0.05 = 495"
pValsgrfdr1 <- p.adjust(pValsgr, method = "fdr")
DE_genesgrfdr1 <- names(which(sort(pValsgrfdr1) < (0.001)))
print(paste0("fdr with 0.001 = ", length(DE_genesgrfdr1)))[1] "fdr with 0.001 = 342"
pValsgrfdr2 <- p.adjust(pValsgr, method = "fdr")
DE_genesgrfdr2 <- names(which(sort(pValsgrfdr2) < (0.2)))
print(paste0("fdr with 0.2 = ", length(DE_genesgrfdr2)))[1] "fdr with 0.2 = 630"
###7.1.2 Testing one group against all other
Test one group against all other
pVals <- apply(
counts(diaph_sce), 1, function(x) {
wilcox.test(
x[colData(diaph_sce)$sc3_5_clusters == 1],
x[!(colData(diaph_sce)$sc3_5_clusters == 1)],
alternative = "greater"
)$p.value
}
)multiple testing correction
pValsf <- p.adjust(pVals, method = "fdr")DE_genesf <- names(which(sort(pValsf) < (0.05)))
head(DE_genesf)[1] "Chodl" "Des" "Cd82" "Sdc4" "Ncam1" "Myf5"
plotPCA(diaph_sce, colour_by = DE_genesf[1])plotExpression(diaph_sce,features = DE_genesf[1:6])+ theme(axis.text.x = element_text(angle = 90, hjust = 1))other methods for p.adjust tests give DE_genes:
print(paste0("fdr = ", length(DE_genesf)))[1] "fdr = 786"
pValsBY <- p.adjust(pVals, method = "BY")
DE_genesBY <- names(which(sort(pValsBY) < (0.05)))
print(paste0("BY = ", length(DE_genesBY)))[1] "BY = 610"
pValsbon <- p.adjust(pVals, method = "bonferroni")
DE_genesbon <- names(which(sort(pValsbon) < (0.05)))
print(paste0("bonferroni = ", length(DE_genesbon)))[1] "bonferroni = 462"
pValshom <- p.adjust(pVals, method = "hommel")
DE_geneshom <- names(which(sort(pValshom) < (0.05)))
print(paste0("hommel = ", length(DE_geneshom)))[1] "hommel = 465"
pValshoch <- p.adjust(pVals, method = "hochberg")
DE_geneshoch <- names(which(sort(pValshoch) < (0.05)))
print(paste0("hochberg = ", length(DE_geneshoch)))[1] "hochberg = 464"
pValsno <- p.adjust(pVals, method = "none")
DE_genesno <- names(which(sort(pValsno) < (0.05)))
print(paste0("none = ", length(DE_genesno)))[1] "none = 1223"
Different threshold of significance:
print(paste0("fdr with 0.05 = ", length(DE_genesf)))[1] "fdr with 0.05 = 786"
pValsfdr1 <- p.adjust(pVals, method = "fdr")
DE_genesfdr1 <- names(which(sort(pValsfdr1) < (0.001)))
print(paste0("fdr with 0.001 = ", length(DE_genesfdr1)))[1] "fdr with 0.001 = 543"
pValsfdr2 <- p.adjust(pVals, method = "fdr")
DE_genesfdr2 <- names(which(sort(pValsfdr2) < (0.2)))
print(paste0("fdr with 0.1 = ", length(DE_genesfdr2)))[1] "fdr with 0.1 = 1011"